#! -*- coding:utf-8 -*-
import re
import pymongo
import codecs,sys
from pymongo import MongoClient
import jieba
from gensim import corpora, models, similarities
import jieba.analyse
from pprint import pprint  # pretty-printer
import htmldb

# Python 2-only encoding workaround: reload ``sys`` and then set the
# interpreter-wide default string encoding to UTF-8. The reload has to come
# first (presumably because ``setdefaultencoding`` is removed from ``sys`` at
# interpreter startup — confirm against the Python 2 ``site`` docs).
# NOTE(review): ``reload`` is not a builtin on Python 3 and
# ``sys.setdefaultencoding`` does not exist there, so this script can only
# run on Python 2 as written.
reload(sys)
sys.setdefaultencoding('utf-8')

# Quick jieba demo: segment one sample sentence and print the resulting
# tokens joined by commas.
dictpath="dicts.txt"  # only referenced by the disabled load_userdict call below
sentence = u'李小福是创新办主任也是云计算方面的专家'
# Disabled experiments: loading a custom user dictionary and tuning the
# frequency of a single word.
#jieba.load_userdict(dictpath)
#jieba.suggest_freq((u'专家'), False)
# ``words`` is consumed by the join below; the cut_all=True variant is left
# commented out.
words= jieba.cut(sentence) #,cut_all=True)
print ','.join(words)

'''
匹配【】
'''
text = open("liuduzhou.txt").readlines()
print len(text)
result=''
title=''
head=''
for line in text:
    if (len(line.strip())<1):
        continue
    if re.match(r'【[0-9]+】(.+)',line):
        if (len(title)>0):
            print head, title, result
            htmldb.saveto_mongo("liuduzhou", result, title)
        mat = re.match(r'【[0-9]+】(.+)',line)
        title=mat.group(1)
        head=line
        result=''
    else:
        result+=line
print head, title, result
htmldb.saveto_mongo("liuduzhou",result,title)

